# Check the file path
here("nhanes3.rda")
## [1] "/cloud/project/nhanes3.rda"
# Load the saved R data
load(here("nhanes3.rda"))
# Remake a few variables from last class if they are no longer in your environment
sex1 <- factor(nhanes$sex, levels = c(1, 2), labels = c("male", "female"))
AGE5b <- cut(nhanes$age, quantile(nhanes$age, c(0, .2, .4, .6, .8, 1)), include.lowest = T) # quintiles
AGE5c <- cut(nhanes$age, breaks = c(19, 40, 50, 60, 70, 90))
age5c <- unclass(AGE5c)
nhanes <- cbind(nhanes, sex1, AGE5b, AGE5c, age5c)
# Clean up the dataset. Make sure NaN are NA for key covariates
table(nhanes$educ, useNA = "always")
##
## 1 2 3 NaN <NA>
## 2032 2349 660 33 0
nhanes$educ[is.nan(nhanes$educ)] <- NA
table(nhanes$educ, useNA = "always")
##
## 1 2 3 <NA>
## 2032 2349 660 33
table(nhanes$alc, useNA = "always")
##
## 0 1 NaN <NA>
## 2753 2189 132 0
nhanes$alc[is.nan(nhanes$alc)] <- NA
table(nhanes$alc, useNA = "always")
##
## 0 1 <NA>
## 2753 2189 132
## Does the distribution of log(sbp) look closer to the normal distribution?
par(mfrow = c(2, 1))
hist(nhanes$sbp, nclass = 20, col = "darkorchid")
hist(log(nhanes$sbp), nclass = 20, col = "seagreen3")
## Let's start with non-log transformed SBP first. Look at bivariate association between sbp and continuous covariates
par(mfrow = c(1, 1))
plot(nhanes$age, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "Age (years)")
lines(smooth.spline(nhanes$age, nhanes$sbp, df = 10), col = "dodgerblue", lwd = 3)
plot(nhanes$bmi, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "BMI (kg/m2)")
lines(smooth.spline(na.omit(nhanes$bmi), nhanes$sbp[na.omit(nhanes$bmi)], df = 3, ), col = "grey60", lwd = 3)
plot(nhanes$bpb, nhanes$sbp, pch = 20, cex = 0.7, col = "dodgerblue", cex.lab = 1.2, las = 1, ylab = "SBP", xlab = "Blood lead level (ug/dL)")
lines(smooth.spline(nhanes$bpb, nhanes$sbp, df = 10), col = "grey60", lwd = 3)
## Start creating a simple regression model for systolic blood pressure,
## including only blood lead (bpb) (crude model).
sbp.model <- lm(sbp ~ bpb, na.action = na.omit, data = nhanes)
sbp.model
##
## Call:
## lm(formula = sbp ~ bpb, data = nhanes, na.action = na.omit)
##
## Coefficients:
## (Intercept) bpb
## 121.666 1.162
summary(sbp.model) # Is blood lead associated with SBP? Is this the direction you would expect?
##
## Call:
## lm(formula = sbp ~ bpb, data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.586 -13.964 -3.757 10.612 114.521
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 121.66581 0.43552 279.36 <2e-16 ***
## bpb 1.16166 0.08528 13.62 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.6 on 5072 degrees of freedom
## Multiple R-squared: 0.03529, Adjusted R-squared: 0.0351
## F-statistic: 185.6 on 1 and 5072 DF, p-value: < 2.2e-16
summary.aov(sbp.model)
## Df Sum Sq Mean Sq F value Pr(>F)
## bpb 1 71318 71318 185.6 <2e-16 ***
## Residuals 5072 1949387 384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(sbp.model)
## Analysis of Variance Table
##
## Response: sbp
## Df Sum Sq Mean Sq F value Pr(>F)
## bpb 1 71318 71318 185.56 < 2.2e-16 ***
## Residuals 5072 1949387 384
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Add age in the model
sbp.model1 <- lm(sbp ~ bpb + age, na.action = na.omit, data = nhanes)
summary(sbp.model1) # Does anything change with age in the model?
##
## Call:
## lm(formula = sbp ~ bpb + age, data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -62.405 -10.431 -1.427 8.470 101.240
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 95.00660 0.63062 150.655 < 2e-16 ***
## bpb 0.46641 0.07063 6.604 4.41e-11 ***
## age 0.60338 0.01181 51.077 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.93 on 5071 degrees of freedom
## Multiple R-squared: 0.363, Adjusted R-squared: 0.3628
## F-statistic: 1445 on 2 and 5071 DF, p-value: < 2.2e-16
## Add race, which should be a categorical variable. We can use factor() or as.factor() to create indicator variables for each value of race
table(nhanes$race)
##
## 1 2
## 3634 1440
sbp.model2 <- lm(sbp ~ bpb + age + factor(race), na.action = na.omit, data = nhanes)
summary(sbp.model2)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.773 -10.420 -1.257 8.525 101.680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93.88241 0.66373 141.446 < 2e-16 ***
## bpb 0.42848 0.07080 6.052 1.53e-09 ***
## age 0.61400 0.01195 51.378 < 2e-16 ***
## factor(race)2 2.66690 0.50308 5.301 1.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared: 0.3665, Adjusted R-squared: 0.3661
## F-statistic: 977.8 on 3 and 5070 DF, p-value: < 2.2e-16
#### Change the reference level for a factor variable
# Method 1: Make a new (or alter the old) variable
table(nhanes$race)
##
## 1 2
## 3634 1440
race2 <- relevel(factor(nhanes$race), ref = 2)
table(race2)
## race2
## 2 1
## 1440 3634
sbp.model2 <- lm(sbp ~ bpb + age + factor(race2), na.action = na.omit, data = nhanes)
summary(sbp.model2)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race2), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.773 -10.420 -1.257 8.525 101.680
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 96.54931 0.69301 139.319 < 2e-16 ***
## bpb 0.42848 0.07080 6.052 1.53e-09 ***
## age 0.61400 0.01195 51.378 < 2e-16 ***
## factor(race2)1 -2.66690 0.50308 -5.301 1.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared: 0.3665, Adjusted R-squared: 0.3661
## F-statistic: 977.8 on 3 and 5070 DF, p-value: < 2.2e-16
# Method 2: Change the contrasts in the lm() statement
sbp.model2 <- lm(sbp ~ bpb + age + C(factor(race), contr.treatment(2, base = 2)), na.action = na.omit, data = nhanes)
summary(sbp.model2)
##
## Call:
## lm(formula = sbp ~ bpb + age + C(factor(race), contr.treatment(2,
## base = 2)), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -61.773 -10.420 -1.257 8.525 101.680
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 96.54931 0.69301 139.319
## bpb 0.42848 0.07080 6.052
## age 0.61400 0.01195 51.378
## C(factor(race), contr.treatment(2, base = 2))1 -2.66690 0.50308 -5.301
## Pr(>|t|)
## (Intercept) < 2e-16 ***
## bpb 1.53e-09 ***
## age < 2e-16 ***
## C(factor(race), contr.treatment(2, base = 2))1 1.20e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.89 on 5070 degrees of freedom
## Multiple R-squared: 0.3665, Adjusted R-squared: 0.3661
## F-statistic: 977.8 on 3 and 5070 DF, p-value: < 2.2e-16
# R provides Type I sequential SS, not the default Type III marginal SS reported by SAS and SPSS.
# We can use the drop1() function to produce the familiar Type III results.
# It will compare each term with the full model.
anova(sbp.model2)
## Analysis of Variance Table
##
## Response: sbp
## Df Sum Sq Mean Sq F value
## bpb 1 71318 71318 282.470
## age 1 662217 662217 2622.848
## C(factor(race), contr.treatment(2, base = 2)) 1 7095 7095 28.102
## Residuals 5070 1280075 252
## Pr(>F)
## bpb < 2.2e-16 ***
## age < 2.2e-16 ***
## C(factor(race), contr.treatment(2, base = 2)) 1.199e-07 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
drop1(sbp.model2, test = "F")
## Single term deletions
##
## Model:
## sbp ~ bpb + age + C(factor(race), contr.treatment(2, base = 2))
## Df Sum of Sq RSS AIC
## <none> 1280075 28070
## bpb 1 9247 1289322 28104
## age 1 666469 1946544 30195
## C(factor(race), contr.treatment(2, base = 2)) 1 7095 1287170 28096
## F value Pr(>F)
## <none>
## bpb 36.625 1.535e-09 ***
## age 2639.688 < 2.2e-16 ***
## C(factor(race), contr.treatment(2, base = 2)) 28.102 1.199e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Add other covariates to the model: which variables are biologically important?
## Add sex, BMI, educ and smk.
sbp.model3 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk), na.action = na.omit, data = nhanes)
summary(sbp.model3) # What covariates are associated with SBP? Do they make sense biologically?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.202 -9.583 -1.317 7.926 101.116
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83.160324 1.346404 61.765 < 2e-16 ***
## bpb 0.277763 0.075759 3.666 0.000249 ***
## age 0.615200 0.012258 50.189 < 2e-16 ***
## factor(race)2 2.024195 0.498277 4.062 4.93e-05 ***
## factor(sex)2 -3.912620 0.476734 -8.207 2.85e-16 ***
## bmi 0.530437 0.038131 13.911 < 2e-16 ***
## factor(educ)2 -0.003008 0.484418 -0.006 0.995046
## factor(educ)3 -2.675500 0.711182 -3.762 0.000170 ***
## factor(smk)2 -1.991441 0.562961 -3.537 0.000408 ***
## factor(smk)3 -0.270575 0.559123 -0.484 0.628459
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.45 on 5021 degrees of freedom
## (43 observations deleted due to missingness)
## Multiple R-squared: 0.3977, Adjusted R-squared: 0.3966
## F-statistic: 368.4 on 9 and 5021 DF, p-value: < 2.2e-16
# See output from models 1,2,and 3 side by side
stargazer(sbp.model1, sbp.model2, sbp.model3, type = "text", dep.var.labels = "Systolic Blood Pressure (mmHg)")
##
## ==============================================================================================================================
## Dependent variable:
## -------------------------------------------------------------------------------
## Systolic Blood Pressure (mmHg)
## (1) (2) (3)
## ------------------------------------------------------------------------------------------------------------------------------
## bpb 0.466*** 0.428*** 0.278***
## (0.071) (0.071) (0.076)
##
## age 0.603*** 0.614*** 0.615***
## (0.012) (0.012) (0.012)
##
## C(factor(race), contr.treatment(2, base = 2))1 -2.667***
## (0.503)
##
## factor(race)2 2.024***
## (0.498)
##
## factor(sex)2 -3.913***
## (0.477)
##
## bmi 0.530***
## (0.038)
##
## factor(educ)2 -0.003
## (0.484)
##
## factor(educ)3 -2.675***
## (0.711)
##
## factor(smk)2 -1.991***
## (0.563)
##
## factor(smk)3 -0.271
## (0.559)
##
## Constant 95.007*** 96.549*** 83.160***
## (0.631) (0.693) (1.346)
##
## ------------------------------------------------------------------------------------------------------------------------------
## Observations 5,074 5,074 5,031
## R2 0.363 0.367 0.398
## Adjusted R2 0.363 0.366 0.397
## Residual Std. Error 15.932 (df = 5071) 15.890 (df = 5070) 15.445 (df = 5021)
## F Statistic 1,444.936*** (df = 2; 5071) 977.807*** (df = 3; 5070) 368.374*** (df = 9; 5021)
## ==============================================================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01
### Check if alcohol consumption is a confounder
sbp.model4 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc), na.action = na.omit, data = nhanes)
summary(sbp.model4) # Is alcohol associated with sbp?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk) + factor(alc), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.495 -9.580 -1.311 7.928 101.235
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.33776 1.41399 58.231 < 2e-16 ***
## bpb 0.24874 0.07705 3.228 0.001253 **
## age 0.61361 0.01276 48.106 < 2e-16 ***
## factor(race)2 2.00975 0.50657 3.967 7.37e-05 ***
## factor(sex)2 -3.87208 0.49390 -7.840 5.50e-15 ***
## bmi 0.55795 0.03860 14.454 < 2e-16 ***
## factor(educ)2 0.02761 0.49227 0.056 0.955279
## factor(educ)3 -2.71562 0.72412 -3.750 0.000179 ***
## factor(smk)2 -2.07944 0.56937 -3.652 0.000263 ***
## factor(smk)3 -0.20883 0.57141 -0.365 0.714779
## factor(alc)1 0.41798 0.49359 0.847 0.397144
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 4891 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3967, Adjusted R-squared: 0.3954
## F-statistic: 321.6 on 10 and 4891 DF, p-value: < 2.2e-16
summary(sbp.model4)$coef # Pull out relevant information from the output: Coefficient table
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.3377552 1.41399277 58.23067609 0.000000e+00
## bpb 0.2487418 0.07704799 3.22840133 1.253061e-03
## age 0.6136075 0.01275532 48.10601736 0.000000e+00
## factor(race)2 2.0097527 0.50656503 3.96741297 7.369686e-05
## factor(sex)2 -3.8720755 0.49390486 -7.83971931 5.502517e-15
## bmi 0.5579520 0.03860066 14.45446563 2.100456e-46
## factor(educ)2 0.0276076 0.49227488 0.05608169 9.552790e-01
## factor(educ)3 -2.7156205 0.72411551 -3.75025867 1.786946e-04
## factor(smk)2 -2.0794440 0.56937030 -3.65218209 2.627418e-04
## factor(smk)3 -0.2088304 0.57140734 -0.36546680 7.147788e-01
## factor(alc)1 0.4179774 0.49359253 0.84680655 3.971444e-01
summary(sbp.model4)$coef[2, 1] # Pull out just the beta coefficient blood Pb
## [1] 0.2487418
# 10% guideline for confounders: Does the addition of the new variable change the beta coefficient of interest by >10%?
(summary(sbp.model4)$coef[2, 1] - summary(sbp.model3)$coef[2, 1]) / summary(sbp.model3)$coef[2, 1] # Calculate the percent change in the blood Pb coefficient before and after alcohol in the model
## [1] -0.1044827
# Does alcohol meet the guideline for a confounder?
## Compute difference in SBP and 95% CI per one unit increase and IQR increase in bpb
# For one unit increase in exposure
summary(sbp.model4)$coef[2, 1] # Beta coefficient
## [1] 0.2487418
summary(sbp.model4)$coef[2, 1] - 1.96 * summary(sbp.model4)$coef[2, 2] # Lower confidence interval
## [1] 0.09772778
summary(sbp.model4)$coef[2, 1] + 1.96 * summary(sbp.model4)$coef[2, 2] # Upper confidence interval
## [1] 0.3997559
regress.display(sbp.model4) # Alternative: Convenience function to show all of the CI with the beta coefficients
##
## Coeff lower095ci upper095ci Pr>|t|
## (Intercept) 82.3377552 79.56569429 85.1098161 0.000000e+00
## bpb 0.2487418 0.09769317 0.3997905 1.253061e-03
## age 0.6136075 0.58860131 0.6386136 0.000000e+00
## factor(race)2 2.0097527 1.01665770 3.0028476 7.369686e-05
## factor(sex)2 -3.8720755 -4.84035080 -2.9038001 5.502517e-15
## bmi 0.5579520 0.48227733 0.6336266 2.100456e-46
## factor(educ)2 0.0276076 -0.93747225 0.9926875 9.552790e-01
## factor(educ)3 -2.7156205 -4.13521210 -1.2960289 1.786946e-04
## factor(smk)2 -2.0794440 -3.19566554 -0.9632225 2.627418e-04
## factor(smk)3 -0.2088304 -1.32904543 0.9113846 7.147788e-01
## factor(alc)1 0.4179774 -0.54968566 1.3856404 3.971444e-01
# To improve communication of findings, report association for an IQR increase in exposure
IQR(nhanes$bpb)
## [1] 3.1
IQR(nhanes$bpb) * summary(sbp.model4)$coef[2, 1] # Per IQR increase in exposure, beta coefficient
## [1] 0.7710997
l95ci.iqr <- IQR(nhanes$bpb) * (summary(sbp.model4)$coef[2, 1] - 1.96 * summary(sbp.model4)$coef[2, 2]) # Per IQR increase in exposure, lower confidence interval
u95ci.iqr <- IQR(nhanes$bpb) * (summary(sbp.model4)$coef[2, 1] + 1.96 * summary(sbp.model4)$coef[2, 2]) # Per IQR increase in exposure, upper confidence interval
regress.display(sbp.model4)$table[2, ] * IQR(nhanes$bpb) # Multiply the convenience table by the IQR
## Coeff lower095ci upper095ci Pr>|t|
## 0.77109973 0.30284884 1.23935061 0.00388449
## Does the variable packyrs give a better estimate than smk?
sbp.model5 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + packyrs + factor(alc), na.action = na.omit, data = nhanes)
summary(sbp.model5)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + packyrs + factor(alc), data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.679 -9.666 -1.233 8.026 101.502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 82.04513 1.42080 57.746 < 2e-16 ***
## bpb 0.24963 0.07906 3.157 0.001602 **
## age 0.61464 0.01317 46.658 < 2e-16 ***
## factor(race)2 2.22727 0.51892 4.292 1.81e-05 ***
## factor(sex)2 -3.70826 0.50585 -7.331 2.68e-13 ***
## bmi 0.55073 0.03930 14.012 < 2e-16 ***
## factor(educ)2 0.04590 0.50476 0.091 0.927547
## factor(educ)3 -2.60357 0.73817 -3.527 0.000424 ***
## packyrs -0.02462 0.01097 -2.244 0.024860 *
## factor(alc)1 0.32565 0.49934 0.652 0.514333
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.42 on 4677 degrees of freedom
## (387 observations deleted due to missingness)
## Multiple R-squared: 0.3952, Adjusted R-squared: 0.394
## F-statistic: 339.6 on 9 and 4677 DF, p-value: < 2.2e-16
AIC(sbp.model4)
## [1] 40722.09
AIC(sbp.model5)
## [1] 38957.68
## Which model has the lower AIC? The one with smk (model 4) or the one with packyrs (model 5)?
## Model 5 also dropped 215 more people. This is an analyst judgment call (no perfect answer). I would probably stick with model 4 since the AIC are close, to keep the people in the study.
## Run two models: One with age and one adding age as a quadratic function.
## Does the quadratic term improve the fit of the model?
sbp.model6 <- lm(sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6) # Is the age squared term significant?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.724 -9.518 -1.413 7.689 101.809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.0142129 1.8737032 46.440 < 2e-16 ***
## bpb 0.2582957 0.0769837 3.355 0.000799 ***
## age 0.3452699 0.0718091 4.808 1.57e-06 ***
## factor(race)2 2.0928803 0.5063451 4.133 3.64e-05 ***
## factor(sex)2 -3.7960138 0.4936354 -7.690 1.77e-14 ***
## bmi 0.5913718 0.0395399 14.956 < 2e-16 ***
## factor(educ)2 0.0989887 0.4919604 0.201 0.840541
## factor(educ)3 -2.4208295 0.7272801 -3.329 0.000879 ***
## factor(smk)2 -1.8118191 0.5729428 -3.162 0.001575 **
## factor(smk)3 0.0969062 0.5762782 0.168 0.866465
## factor(alc)1 0.4773454 0.4931648 0.968 0.333131
## I(age^2) 0.0026033 0.0006856 3.797 0.000148 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3984, Adjusted R-squared: 0.3971
## F-statistic: 294.4 on 11 and 4890 DF, p-value: < 2.2e-16
## Use the anova function to compare the 2 models and
## see if the quadratic term improves the model.
anova(sbp.model4, sbp.model6, test = "F")
## Analysis of Variance Table
##
## Model 1: sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) +
## factor(smk) + factor(alc)
## Model 2: sbp ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 4891 1157606
## 2 4890 1154203 1 3403 14.418 0.0001482 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## We'll use sbp.model6 as our final model
## What is the relationship between bpb and sbp?
### Regression Diagnostics
## In the case of linear model, the plot of the model gives diagnostic plots
plot(sbp.model6, which = 1)
plot(sbp.model6, which = 2)
plot(sbp.model6, which = 3)
plot(sbp.model6, which = 4)
plot(sbp.model6, which = 5)
## How about log(sbp)?
## Check how diagnostic plots using log-transformed sbp look like
sbp.model6.log <- lm(log(sbp) ~ bpb + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6.log)
##
## Call:
## lm(formula = log(sbp) ~ bpb + age + factor(race) + factor(sex) +
## bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2),
## data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53660 -0.07304 -0.00487 0.06676 0.57499
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.505e+00 1.401e-02 321.638 < 2e-16 ***
## bpb 1.974e-03 5.755e-04 3.430 0.000609 ***
## age 3.281e-03 5.368e-04 6.113 1.06e-09 ***
## factor(race)2 1.535e-02 3.785e-03 4.054 5.11e-05 ***
## factor(sex)2 -3.504e-02 3.690e-03 -9.496 < 2e-16 ***
## bmi 4.864e-03 2.956e-04 16.455 < 2e-16 ***
## factor(educ)2 1.928e-03 3.678e-03 0.524 0.600129
## factor(educ)3 -1.837e-02 5.437e-03 -3.379 0.000734 ***
## factor(smk)2 -1.415e-02 4.283e-03 -3.303 0.000964 ***
## factor(smk)3 5.033e-04 4.308e-03 0.117 0.906992
## factor(alc)1 5.098e-03 3.687e-03 1.383 0.166742
## I(age^2) 1.389e-05 5.125e-06 2.710 0.006758 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1148 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.4159, Adjusted R-squared: 0.4146
## F-statistic: 316.5 on 11 and 4890 DF, p-value: < 2.2e-16
plot(sbp.model6.log)
hist(residuals(sbp.model6.log), nclass = 20)
par(mfrow = c(2, 2))
plot(sbp.model6, which = 1)
plot(sbp.model6.log, which = 1)
hist(residuals(sbp.model6), main = "Histogram of res(SBP)")
hist(residuals(sbp.model6.log), main = "Histogram of res(log(SBP))")
par(mfrow = c(1, 1))
### Partial Residual Plots
## Plot sbp vs. bpb given that other variables are in the model (adjusted)
## This can be done by 'termplot'
termplot(sbp.model6, partial.resid = TRUE, col.res = "gray30", cex = 0.3, smooth = panel.smooth)
summary(sbp.model6.log)$coef
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.504957e+00 1.400631e-02 321.6375700 0.000000e+00
## bpb 1.973727e-03 5.754686e-04 3.4297737 6.090800e-04
## age 3.281259e-03 5.367875e-04 6.1127716 1.055187e-09
## factor(race)2 1.534593e-02 3.785033e-03 4.0543714 5.105082e-05
## factor(sex)2 -3.504012e-02 3.690025e-03 -9.4959008 3.321831e-21
## bmi 4.863685e-03 2.955687e-04 16.4553462 2.932179e-59
## factor(educ)2 1.927926e-03 3.677504e-03 0.5242485 6.001295e-01
## factor(educ)3 -1.836873e-02 5.436567e-03 -3.3787365 7.338916e-04
## factor(smk)2 -1.414501e-02 4.282864e-03 -3.3026976 9.644763e-04
## factor(smk)3 5.033236e-04 4.307797e-03 0.1168401 9.069915e-01
## factor(alc)1 5.098266e-03 3.686508e-03 1.3829529 1.667424e-01
## I(age^2) 1.388725e-05 5.125029e-06 2.7096926 6.758042e-03
summary(sbp.model6.log)$coef[2, 1]
## [1] 0.001973727
summary(sbp.model6.log)$coef[2, 2]
## [1] 0.0005754686
exp(summary(sbp.model6.log)$coef[2, 1])
## [1] 1.001976
# exp(sbp.model6.log$coef[2])
100 * (exp(summary(sbp.model6.log)$coef[2, 1]) - 1) # Per one unit increase in blood Pb, percent change in systolic blood pressure
## [1] 0.1975676
100 * (exp(summary(sbp.model6.log)$coef[2, 1] - 1.96 * summary(sbp.model6.log)$coef[2, 2]) - 1) # Lower confidence interval
## [1] 0.08461663
100 * (exp(summary(sbp.model6.log)$coef[2, 1] + 1.96 * summary(sbp.model6.log)$coef[2, 2]) - 1) # Upper confidence interval
## [1] 0.310646
IQR(nhanes$bpb)
## [1] 3.1
100 * (exp(IQR(nhanes$bpb) * summary(sbp.model6.log)$coef[2, 1]) - 1) # Per IQR unit increase in blood Pb, percent change in systolic blood pressure
## [1] 0.613731
100 * (exp(IQR(nhanes$bpb) * (summary(sbp.model6.log)$coef[2, 1] - 1.96 * summary(sbp.model6.log)$coef[2, 2])) - 1) # Lower confidence interval
## [1] 0.2625447
100 * (exp(IQR(nhanes$bpb) * (summary(sbp.model6.log)$coef[2, 1] + 1.96 * summary(sbp.model6.log)$coef[2, 2])) - 1) # Upper confidence interval
## [1] 0.9661474
# What if the relationship between blood lead and sbp is non-linear?
library(mgcv)
## Loading required package: nlme
## This is mgcv 1.8-35. For overview type 'help("mgcv-package")'.
##
## Attaching package: 'mgcv'
## The following object is masked from 'package:nnet':
##
## multinom
sbp.model6.gam <- gam(sbp ~ s(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model6.gam)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## sbp ~ s(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88.2143968 1.8617921 47.381 < 2e-16 ***
## age 0.3363437 0.0718470 4.681 2.93e-06 ***
## factor(race)2 2.0313316 0.5062947 4.012 6.11e-05 ***
## factor(sex)2 -3.5783017 0.5001717 -7.154 9.67e-13 ***
## bmi 0.5931173 0.0395125 15.011 < 2e-16 ***
## factor(educ)2 0.1625592 0.4919648 0.330 0.741090
## factor(educ)3 -2.2564780 0.7288554 -3.096 0.001973 **
## factor(smk)2 -1.8333930 0.5724650 -3.203 0.001371 **
## factor(smk)3 -0.0462355 0.5780681 -0.080 0.936254
## factor(alc)1 0.4435799 0.4929664 0.900 0.368262
## I(age^2) 0.0026407 0.0006852 3.854 0.000118 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(bpb) 2.36 3.014 6.61 0.000184 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.398 Deviance explained = 40%
## GCV = 236.21 Scale est. = 235.57 n = 4902
plot(sbp.model6.gam)
plot(sbp.model6.gam, xlab = "Blood lead (ug/dL)", ylab = "Change in log(SBP)") # Does this relationship look linear?
# Option to log transform the exposure
sbp.model7 <- lm(sbp ~ log(bpb) + age + factor(race) + factor(sex) + bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2), na.action = na.omit, data = nhanes)
summary(sbp.model7)
##
## Call:
## lm(formula = sbp ~ log(bpb) + age + factor(race) + factor(sex) +
## bmi + factor(educ) + factor(smk) + factor(alc) + I(age^2),
## data = nhanes, na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.464 -9.543 -1.339 7.746 103.034
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 86.8708956 1.8722723 46.399 < 2e-16 ***
## log(bpb) 1.4222563 0.3526023 4.034 5.58e-05 ***
## age 0.3316940 0.0719935 4.607 4.18e-06 ***
## factor(race)2 2.0361714 0.5066138 4.019 5.93e-05 ***
## factor(sex)2 -3.5467680 0.5053567 -7.018 2.55e-12 ***
## bmi 0.5902534 0.0394943 14.945 < 2e-16 ***
## factor(educ)2 0.1417779 0.4914805 0.288 0.77300
## factor(educ)3 -2.2839018 0.7293924 -3.131 0.00175 **
## factor(smk)2 -1.8271820 0.5726692 -3.191 0.00143 **
## factor(smk)3 -0.0474241 0.5795115 -0.082 0.93478
## factor(alc)1 0.4312724 0.4933101 0.874 0.38203
## I(age^2) 0.0026700 0.0006858 3.893 0.00010 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.399, Adjusted R-squared: 0.3977
## F-statistic: 295.2 on 11 and 4890 DF, p-value: < 2.2e-16
summary(sbp.model6)
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + factor(sex) + bmi +
## factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes,
## na.action = na.omit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.724 -9.518 -1.413 7.689 101.809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.0142129 1.8737032 46.440 < 2e-16 ***
## bpb 0.2582957 0.0769837 3.355 0.000799 ***
## age 0.3452699 0.0718091 4.808 1.57e-06 ***
## factor(race)2 2.0928803 0.5063451 4.133 3.64e-05 ***
## factor(sex)2 -3.7960138 0.4936354 -7.690 1.77e-14 ***
## bmi 0.5913718 0.0395399 14.956 < 2e-16 ***
## factor(educ)2 0.0989887 0.4919604 0.201 0.840541
## factor(educ)3 -2.4208295 0.7272801 -3.329 0.000879 ***
## factor(smk)2 -1.8118191 0.5729428 -3.162 0.001575 **
## factor(smk)3 0.0969062 0.5762782 0.168 0.866465
## factor(alc)1 0.4773454 0.4931648 0.968 0.333131
## I(age^2) 0.0026033 0.0006856 3.797 0.000148 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4890 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3984, Adjusted R-squared: 0.3971
## F-statistic: 294.4 on 11 and 4890 DF, p-value: < 2.2e-16
## compare model6 and model7
AIC(sbp.model6, sbp.model7) # Which model has the lower AIC? The spline term or the log term for blood Pb?
## df AIC
## sbp.model6 13 40709.65
## sbp.model7 13 40704.64
# What if the relationship between blood Pb and sbp varies by sex?
### Try interaction model
sbp.model6.int <- lm(sbp ~ bpb + factor(sex) + bpb * factor(sex) + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes)
# below is the same
sbp.model6.int <- lm(sbp ~ bpb * factor(sex) + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes)
summary(sbp.model6.int) # Is the interaction term significant?
##
## Call:
## lm(formula = sbp ~ bpb * factor(sex) + age + factor(race) + bmi +
## factor(educ) + factor(smk) + factor(alc) + I(age^2), data = nhanes)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.894 -9.530 -1.369 7.642 102.461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 87.8854794 1.9054676 46.123 < 2e-16 ***
## bpb 0.1317420 0.0923617 1.426 0.153825
## factor(sex)2 -5.1875500 0.7476820 -6.938 4.49e-12 ***
## age 0.3385020 0.0718234 4.713 2.51e-06 ***
## factor(race)2 2.0885416 0.5060825 4.127 3.74e-05 ***
## bmi 0.5905818 0.0395204 14.944 < 2e-16 ***
## factor(educ)2 0.1078127 0.4917151 0.219 0.826458
## factor(educ)3 -2.4434079 0.7269556 -3.361 0.000782 ***
## factor(smk)2 -1.7832085 0.5727587 -3.113 0.001860 **
## factor(smk)3 0.1024602 0.5759802 0.178 0.858818
## factor(alc)1 0.4787710 0.4929064 0.971 0.331436
## I(age^2) 0.0026405 0.0006854 3.853 0.000118 ***
## bpb:factor(sex)2 0.3796814 0.1532848 2.477 0.013284 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.36 on 4889 degrees of freedom
## (172 observations deleted due to missingness)
## Multiple R-squared: 0.3992, Adjusted R-squared: 0.3977
## F-statistic: 270.7 on 12 and 4889 DF, p-value: < 2.2e-16
### Stratified by sex
table(nhanes$sex)
##
## 1 2
## 2335 2739
## Male (sex==1)
sbp.model6.male <- lm(sbp ~ bpb + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 1))
summary(sbp.model6.male) # What is the beta coefficient for blood Pb in males?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex ==
## 1))
##
## Residuals:
## Min 1Q Median 3Q Max
## -55.951 -9.164 -1.214 7.452 67.763
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 94.0698502 2.7370216 34.369 < 2e-16 ***
## bpb 0.2201841 0.0907801 2.425 0.015367 *
## age 0.0247105 0.1020797 0.242 0.808748
## factor(race)2 2.6271948 0.7251217 3.623 0.000298 ***
## bmi 0.7190556 0.0667827 10.767 < 2e-16 ***
## factor(educ)2 -0.1580127 0.6931503 -0.228 0.819696
## factor(educ)3 -1.9583250 0.9739884 -2.011 0.044485 *
## factor(smk)2 -0.2674489 0.7894817 -0.339 0.734818
## factor(smk)3 1.5625824 0.8126909 1.923 0.054641 .
## factor(alc)1 -0.1923135 0.6709470 -0.287 0.774422
## I(age^2) 0.0043354 0.0009734 4.454 8.84e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.6 on 2240 degrees of freedom
## (84 observations deleted due to missingness)
## Multiple R-squared: 0.3134, Adjusted R-squared: 0.3103
## F-statistic: 102.3 on 10 and 2240 DF, p-value: < 2.2e-16
## Female (sex==2)
sbp.model6.female <- lm(sbp ~ bpb + age + factor(race) + bmi + factor(educ)
+ factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex == 2))
summary(sbp.model6.female) # What is the beta coefficient for blood Pb in females? Is it similar to that in males?
##
## Call:
## lm(formula = sbp ~ bpb + age + factor(race) + bmi + factor(educ) +
## factor(smk) + factor(alc) + I(age^2), data = nhanes, subset = (sex ==
## 2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -57.975 -9.454 -1.348 7.659 99.869
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77.0605375 2.4865167 30.991 < 2e-16 ***
## bpb 0.2931960 0.1343263 2.183 0.02914 *
## age 0.5576526 0.0990458 5.630 1.99e-08 ***
## factor(race)2 2.0982646 0.6949672 3.019 0.00256 **
## bmi 0.5305908 0.0492797 10.767 < 2e-16 ***
## factor(educ)2 0.3947007 0.6828269 0.578 0.56329
## factor(educ)3 -2.1735912 1.0641023 -2.043 0.04119 *
## factor(smk)2 -1.4335988 0.8458868 -1.695 0.09023 .
## factor(smk)3 -0.5512632 0.8104050 -0.680 0.49642
## factor(alc)1 0.6533917 0.7063162 0.925 0.35501
## I(age^2) 0.0016206 0.0009452 1.715 0.08655 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.66 on 2640 degrees of freedom
## (88 observations deleted due to missingness)
## Multiple R-squared: 0.4611, Adjusted R-squared: 0.4591
## F-statistic: 225.9 on 10 and 2640 DF, p-value: < 2.2e-16
Exercise 6A
## Logistic regression for hypertension
## Look at hypertension status (htn)
tab1(nhanes$htn, graph = F)
## nhanes$htn :
## Frequency Percent Cum. percent
## 0 3135 61.8 61.8
## 1 1939 38.2 100.0
## Total 5074 100.0 100.0
htn.model <- glm(htn ~ bpb + age + factor(sex) + factor(race) + bmi + factor(educ) + factor(smk) + factor(alc),
family = binomial,
na.action = na.omit, data = nhanes)
summary(htn.model)
##
## Call:
## glm(formula = htn ~ bpb + age + factor(sex) + factor(race) +
## bmi + factor(educ) + factor(smk) + factor(alc), family = binomial,
## data = nhanes, na.action = na.omit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4998 -0.7949 -0.4272 0.8959 2.5147
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.635396 0.261535 -25.371 < 2e-16 ***
## bpb 0.023959 0.011750 2.039 0.0414 *
## age 0.061698 0.002235 27.602 < 2e-16 ***
## factor(sex)2 0.063134 0.077426 0.815 0.4148
## factor(race)2 0.494599 0.080020 6.181 6.37e-10 ***
## bmi 0.097071 0.006285 15.446 < 2e-16 ***
## factor(educ)2 0.075786 0.076618 0.989 0.3226
## factor(educ)3 -0.085220 0.114171 -0.746 0.4554
## factor(smk)2 -0.037067 0.086530 -0.428 0.6684
## factor(smk)3 0.080902 0.091358 0.886 0.3759
## factor(alc)1 0.008554 0.077201 0.111 0.9118
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6500.8 on 4901 degrees of freedom
## Residual deviance: 5080.9 on 4891 degrees of freedom
## (172 observations deleted due to missingness)
## AIC: 5102.9
##
## Number of Fisher Scoring iterations: 4
# Compute ORs
logistic.display(htn.model)
##
## OR lower95ci upper95ci Pr(>|Z|)
## bpb 1.0242485 1.0009304 1.048110 4.143898e-02
## age 1.0636409 1.0589913 1.068311 1.040118e-167
## factor(sex)2 1.0651698 0.9151960 1.239720 4.148329e-01
## factor(race)2 1.6398400 1.4018069 1.918292 6.372658e-10
## bmi 1.1019384 1.0884485 1.115596 8.036933e-54
## factor(educ)2 1.0787320 0.9283169 1.253519 3.225922e-01
## factor(educ)3 0.9183104 0.7341876 1.148609 4.554123e-01
## factor(smk)2 0.9636111 0.8132928 1.141712 6.683781e-01
## factor(smk)3 1.0842650 0.9065078 1.296879 3.758574e-01
## factor(alc)1 1.0085903 0.8669650 1.173351 9.117772e-01
# Regression diagnostics
par(mfrow = c(2, 2))
plot(htn.model)
par(mfrow = c(1, 1))
termplot(htn.model, se = T, partial.resid = T)
# ## Poisson regression for respiratory death: Montana dataset from epiDisplay
# data(Montana)
# summ(Montana)
# head(Montana, 10)
# hist(Montana$respdeath)
#
# par(mfrow = c(2, 2))
# tab1(Montana$agegr)
# tab1(Montana$period)
# tab1(Montana$start)
# tab1(Montana$arsenic)
#
# ## label categorical variables
# Montana$agegr <- factor(Montana$agegr, labels = c("40-49", "50-59", "60-69", "70-79"))
# Montana$period <- factor(Montana$period, labels = c("1938-1949", "1950-1959", "1960-1969", "1970-1977"))
# Montana$start <- factor(Montana$start, labels = c("pre-1925", "1925 & after"))
# Montana$arsenic <- factor(Montana$arsenic, labels = c("<1 year", "1-4 years", "5-14 years", "15+ years"))
#
# tab1(Montana$agegr, missing = F)
# tab1(Montana$period, missing = F)
# tab1(Montana$start, missing = F)
# tab1(Montana$arsenic, missing = F)
# par(mfrow = c(1, 1))
#
# ## Compute incidence rate by age and period
# table.pyears <- tapply(Montana$personyrs, list(Montana$period, Montana$agegr), sum)
# table.deaths <- tapply(Montana$respdeath, list(Montana$period, Montana$agegr), sum)
# table.inc10000 <- table.deaths / table.pyears * 10000
# table.inc10000
#
# ## create a time-series plot of the incidence
# plot.ts(table.inc10000, plot.type = "single", xlab = "", ylab = "#/10,000 person-years", xaxt = "n", col = c("black", "blue", "red", "green"), lty = c(2, 1, 1, 2), las = 1)
# points(rep(1:4, 4), table.inc10000, pch = 22, cex = table.pyears / sum(table.pyears) * 20)
# title(main = "Incidence by age and period")
# axis(side = 1, at = 1:4, labels = levels(Montana$period))
# legend("topleft", legend = levels(Montana$agegr)[4:1], col = c("green", "red", "blue", "black"), bg = "white", lty = c(2, 1, 1, 2))
#
# ## check arsenic
# tab1(Montana$arsenic)
# tapply(Montana$respdeath, Montana$arsenic, mean)
# tapply(Montana$personyrs, Montana$arsenic, mean)
#
# ## Poisson model
# resp.mode11 <- glm(respdeath ~ period, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode11)
#
# resp.mode12 <- glm(respdeath ~ agegr, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode12)
#
# resp.mode13 <- glm(respdeath ~ period + agegr, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode13)
#
# AIC(resp.mode11, resp.mode12, resp.mode13)
# ## model2 is better
#
# resp.mode14 <- glm(respdeath ~ agegr + arsenic, offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode14)
#
# ## is there a linear trend across arsenic exposure?
# resp.mode14.lin <- glm(respdeath ~ agegr + as.numeric(arsenic), offset = log(personyrs), family = poisson, data = Montana)
# summary(resp.mode14.lin)
#
# ## compute IRR
# idr.display(resp.mode14)
Optional: Exercise 6B
# Matched case-control dataset available in the epiDisplay package (packages can contain practice data as well as functions)
data(VC1to1)
summ(VC1to1) # What is the shape/structure of this dataset?
##
## No. of observations = 52
##
## Var. name obs. mean median s.d. min. max.
## 1 matset 52 13.5 13.5 7.57 1 26
## 2 case 52 0.5 0.5 0.5 0 1
## 3 smoking 52 0.81 1 0.4 0 1
## 4 rubber 52 0.33 0 0.47 0 1
## 5 alcohol 52 0.52 1 0.5 0 1
head(VC1to1) # 1 case matched to 1 control
## matset case smoking rubber alcohol
## 1 1 1 1 0 0
## 2 1 0 1 0 0
## 3 2 1 1 0 1
## 4 2 0 1 1 0
## 5 3 1 1 1 0
## 6 3 0 1 1 0
# Reshape the data to facilitate data exploration
# function 'reshape' converts wide to long or vice versa
wide <- reshape(VC1to1, timevar = "case", v.names = c("smoking", "rubber", "alcohol"), idvar = "matset", direction = "wide")
head(wide, 3)
## matset smoking.1 rubber.1 alcohol.1 smoking.0 rubber.0 alcohol.0
## 1 1 1 0 0 1 0 0
## 3 2 1 0 1 1 1 0
## 5 3 1 1 0 1 1 0
table(wide$smoking.1, wide$smoking.0, dnn = c("smoking in case", "smoking in control"))
## smoking in control
## smoking in case 0 1
## 0 0 5
## 1 5 16
# dnn: dimnames names
# matchTab() is for the conditional OR (McNemar's OR)
matchTab(VC1to1$case, VC1to1$smoking, strata = VC1to1$matset) # Is smoking exposure associated with cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to1 = 1
##
## Exposure status: smoking = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 0 5
## 1 5 16
##
## Odds ratio by Mantel-Haenszel method = 1
##
## Odds ratio by maximum likelihood estimate (MLE) method = 1
## 95%CI= 0.29 , 3.454
##
matchTab(VC1to1$case, VC1to1$rubber, strata = VC1to1$matset) # Is working in the rubber industry associated with cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to1 = 1
##
## Exposure status: rubber = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 13 5
## 1 4 4
##
## Odds ratio by Mantel-Haenszel method = 0.8
##
## Odds ratio by maximum likelihood estimate (MLE) method = 0.8
## 95%CI= 0.215 , 2.979
##
matchTab(VC1to1$case, VC1to1$alcohol, strata = VC1to1$matset) # Is alcohol consumption associated iwth cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to1 = 1
##
## Exposure status: alcohol = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 7 2
## 1 9 8
##
## Odds ratio by Mantel-Haenszel method = 4.5
##
## Odds ratio by maximum likelihood estimate (MLE) method = 4.5
## 95%CI= 0.972 , 20.827
##
## look at the full dataset VC1to6
data(VC1to6) # 1 case matched to up to 6 controls
summ(VC1to6)
##
## No. of observations = 119
##
## Var. name obs. mean median s.d. min. max.
## 1 matset 119 15.75 17 6.96 1 26
## 2 case 119 0.22 0 0.41 0 1
## 3 smoking 119 0.7 1 0.46 0 1
## 4 rubber 119 0.37 0 0.48 0 1
## 5 alcohol 119 0.42 0 0.5 0 1
VC1to6[, ]
## matset case smoking rubber alcohol
## 1 1 1 1 0 0
## 2 1 0 1 0 0
## 3 2 1 1 0 1
## 4 2 0 1 1 0
## 5 3 1 1 1 0
## 6 3 0 1 1 0
## 7 4 1 1 0 0
## 8 4 0 1 1 1
## 9 4 0 0 1 1
## 10 5 1 0 0 1
## 11 5 0 1 0 0
## 12 5 0 1 0 0
## 13 6 1 1 0 1
## 14 6 0 0 0 0
## 15 6 0 0 0 0
## 16 7 1 1 0 1
## 17 7 0 1 0 0
## 18 7 0 1 0 1
## 19 7 0 1 1 0
## 20 8 1 1 0 0
## 21 8 0 1 0 0
## 22 8 0 1 0 1
## 23 8 0 0 1 0
## 24 9 1 1 1 1
## 25 9 0 1 1 0
## 26 9 0 1 1 0
## 27 9 0 0 1 0
## 28 10 1 0 0 0
## 29 10 0 1 1 0
## 30 10 0 0 0 0
## 31 10 0 1 0 0
## 32 11 1 1 0 1
## 33 11 0 1 0 1
## 34 11 0 0 0 0
## 35 11 0 1 0 1
## 36 12 1 1 0 0
## 37 12 0 1 0 1
## 38 12 0 1 0 0
## 39 12 0 0 0 0
## 40 13 1 1 1 0
## 41 13 0 0 0 0
## 42 13 0 0 0 0
## 43 13 0 0 0 0
## 44 14 1 1 0 1
## 45 14 0 1 0 1
## 46 14 0 0 0 0
## 47 14 0 1 1 0
## 48 14 0 1 0 1
## 49 15 1 1 0 1
## 50 15 0 1 0 0
## 51 15 0 0 0 0
## 52 15 0 1 0 0
## 53 15 0 1 0 1
## 54 16 1 1 0 1
## 55 16 0 0 0 1
## 56 16 0 1 0 1
## 57 16 0 1 0 1
## 58 16 0 1 1 0
## 59 17 1 1 1 1
## 60 17 0 1 0 1
## 61 17 0 1 1 1
## 62 17 0 1 0 0
## 63 17 0 0 1 0
## 64 18 1 1 0 1
## 65 18 0 1 0 1
## 66 18 0 0 1 0
## 67 18 0 0 1 0
## 68 18 0 1 0 0
## 69 18 0 1 0 1
## 70 19 1 0 0 0
## 71 19 0 1 0 0
## 72 19 0 0 0 0
## 73 19 0 0 0 0
## 74 19 0 0 1 0
## 75 19 0 0 0 0
## 76 20 1 1 1 1
## 77 20 0 0 0 0
## 78 20 0 0 0 0
## 79 20 0 0 0 0
## 80 20 0 1 0 0
## 81 20 0 0 0 0
## 82 21 1 1 1 1
## 83 21 0 1 1 1
## 84 21 0 1 1 1
## 85 21 0 1 0 1
## 86 21 0 1 1 1
## 87 21 0 1 1 1
## 88 21 0 1 1 1
## 89 22 1 0 0 1
## 90 22 0 1 1 1
## 91 22 0 0 0 1
## 92 22 0 1 1 0
## 93 22 0 0 0 0
## 94 22 0 0 1 0
## 95 22 0 1 0 0
## 96 23 1 1 1 1
## 97 23 0 1 1 1
## 98 23 0 1 1 1
## 99 23 0 1 1 0
## 100 23 0 1 1 1
## 101 23 0 1 1 0
## 102 23 0 1 1 0
## 103 24 1 1 1 1
## 104 24 0 1 0 0
## 105 24 0 1 0 0
## 106 24 0 0 1 1
## 107 24 0 1 0 0
## 108 24 0 1 0 1
## 109 24 0 1 0 0
## 110 25 1 0 0 0
## 111 25 0 1 1 0
## 112 25 0 1 1 1
## 113 25 0 1 0 1
## 114 25 0 1 0 0
## 115 26 1 1 0 1
## 116 26 0 0 0 0
## 117 26 0 1 1 0
## 118 26 0 0 0 0
## 119 26 0 1 1 1
# what is the effect of smoking?
matchTab(VC1to6$case, VC1to6$smoking, strata = VC1to6$matset) # Is smoking exposure associated with cancer?
##
## Exposure status: $ = 1
##
## Exposure status: VC1to6 = 1
##
## Exposure status: smoking = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 0 0
## 1 0 3
##
## Number of controls = 2
## No. of controls exposed
## No. of cases exposed 0 1 2
## 0 0 0 1
## 1 1 1 0
##
## Number of controls = 3
## No. of controls exposed
## No. of cases exposed 0 1 2 3
## 0 0 0 1 0
## 1 1 0 4 1
##
## Number of controls = 4
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4
## 0 0 0 0 0 1
## 1 0 0 1 4 0
##
## Number of controls = 5
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5
## 0 0 1 0 0 0 0
## 1 0 1 0 1 0 0
##
## Number of controls = 6
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5 6
## 0 0 0 0 1 0 0 0
## 1 0 0 0 0 0 1 2
##
## Odds ratio by Mantel-Haenszel method = 1.988
##
## Odds ratio by maximum likelihood estimate (MLE) method = 2.066
## 95%CI= 0.678 , 6.301
##
matchTab(VC1to6$case, VC1to6$alcohol, strata = VC1to6$matset)
##
## Exposure status: $ = 1
##
## Exposure status: VC1to6 = 1
##
## Exposure status: alcohol = 1
##
## Total number of match sets in the tabulation = 26
##
## Number of controls = 1
## No. of controls exposed
## No. of cases exposed 0 1
## 0 2 0
## 1 1 0
##
## Number of controls = 2
## No. of controls exposed
## No. of cases exposed 0 1 2
## 0 0 0 1
## 1 2 0 0
##
## Number of controls = 3
## No. of controls exposed
## No. of cases exposed 0 1 2 3
## 0 2 2 0 0
## 1 1 1 1 0
##
## Number of controls = 4
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4
## 0 0 0 1 0 0
## 1 0 2 2 1 0
##
## Number of controls = 5
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5
## 0 1 0 0 0 0 0
## 1 1 0 1 0 0 0
##
## Number of controls = 6
## No. of controls exposed
## No. of cases exposed 0 1 2 3 4 5 6
## 0 0 0 0 0 0 0 0
## 1 0 0 2 1 0 0 1
##
## Odds ratio by Mantel-Haenszel method = 5.386
##
## Odds ratio by maximum likelihood estimate (MLE) method = 5.655
## 95%CI= 1.811 , 17.659
##